In [1]:
import warnings
warnings.filterwarnings("ignore", message="use_inf_as_na option is deprecated")
In [2]:
# Importem llibreries
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
In [3]:
# Relative location of the gym members CSV
path_scatter = "data/gym_members_exercise_tracking.csv"

# Read the CSV into a DataFrame
df_scatter = pd.read_csv(filepath_or_buffer=path_scatter)

# Initial inspection: summary statistics of the numeric columns
# (kept as the last expression so the notebook renders it)
df_scatter.describe()
Out[3]:
Age Weight (kg) Height (m) Max_BPM Avg_BPM Resting_BPM Session_Duration (hours) Calories_Burned Fat_Percentage Water_Intake (liters) Workout_Frequency (days/week) Experience_Level BMI
count 973.000000 973.000000 973.00000 973.000000 973.000000 973.000000 973.000000 973.000000 973.000000 973.000000 973.000000 973.000000 973.000000
mean 38.683453 73.854676 1.72258 179.883864 143.766701 62.223022 1.256423 905.422405 24.976773 2.626619 3.321686 1.809866 24.912127
std 12.180928 21.207500 0.12772 11.525686 14.345101 7.327060 0.343033 272.641516 6.259419 0.600172 0.913047 0.739693 6.660879
min 18.000000 40.000000 1.50000 160.000000 120.000000 50.000000 0.500000 303.000000 10.000000 1.500000 2.000000 1.000000 12.320000
25% 28.000000 58.100000 1.62000 170.000000 131.000000 56.000000 1.040000 720.000000 21.300000 2.200000 3.000000 1.000000 20.110000
50% 40.000000 70.000000 1.71000 180.000000 143.000000 62.000000 1.260000 893.000000 26.200000 2.600000 3.000000 2.000000 24.160000
75% 49.000000 86.000000 1.80000 190.000000 156.000000 68.000000 1.460000 1076.000000 29.300000 3.100000 4.000000 2.000000 28.560000
max 59.000000 129.900000 2.00000 199.000000 169.000000 74.000000 2.000000 1783.000000 35.000000 3.700000 5.000000 3.000000 49.840000
In [4]:
# Preview the first five rows of the gym dataset
df_scatter.head(n=5)
Out[4]:
Age Gender Weight (kg) Height (m) Max_BPM Avg_BPM Resting_BPM Session_Duration (hours) Calories_Burned Workout_Type Fat_Percentage Water_Intake (liters) Workout_Frequency (days/week) Experience_Level BMI
0 56 Male 88.3 1.71 180 157 60 1.69 1313.0 Yoga 12.6 3.5 4 3 30.20
1 46 Female 74.9 1.53 179 151 66 1.30 883.0 HIIT 33.9 2.1 4 2 32.00
2 32 Female 68.1 1.66 167 122 54 1.11 677.0 Cardio 33.4 2.3 4 2 24.71
3 25 Male 53.2 1.70 190 164 56 0.59 532.0 Strength 28.8 2.1 3 1 18.41
4 38 Male 46.1 1.79 188 158 68 0.64 556.0 Strength 29.2 2.8 3 1 14.39
In [5]:
# Scatter plot of calories burned against session duration (hours),
# with one colour per gender.
scatter_fig, scatter_ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(
    data=df_scatter,
    x='Session_Duration (hours)',
    y='Calories_Burned',
    hue='Gender',
    alpha=0.6,  # semi-transparent markers
    ax=scatter_ax,
)
scatter_ax.set_title("Calories cremades per Duració de l'entrenament")
scatter_ax.set_xlabel("Duració de l'entrenament en Hores")
scatter_ax.set_ylabel('Calories cremades')
plt.show()
No description has been provided for this image
In [6]:
# Relative location of the Udemy courses CSV
path_sunburst = "data/udemy_online_education_courses_dataset.csv"

# Read the CSV into a DataFrame
df_sunburst = pd.read_csv(filepath_or_buffer=path_sunburst)

# Initial inspection: summary statistics of the numeric columns
# (kept as the last expression so the notebook renders it)
df_sunburst.describe()
Out[6]:
course_id price num_subscribers num_reviews num_lectures content_duration
count 3.678000e+03 3678.000000 3678.000000 3678.000000 3678.000000 3678.000000
mean 6.759720e+05 66.049483 3197.150625 156.259108 40.108755 4.094517
std 3.432732e+05 61.005755 9504.117010 935.452044 50.383346 6.053840
min 8.324000e+03 0.000000 0.000000 0.000000 0.000000 0.000000
25% 4.076925e+05 20.000000 111.000000 4.000000 15.000000 1.000000
50% 6.879170e+05 45.000000 911.500000 18.000000 25.000000 2.000000
75% 9.613555e+05 95.000000 2546.000000 67.000000 45.750000 4.500000
max 1.282064e+06 200.000000 268923.000000 27445.000000 779.000000 78.500000
In [7]:
# Preview the first five rows of the Udemy dataset
df_sunburst.head(n=5)
Out[7]:
course_id course_title url is_paid price num_subscribers num_reviews num_lectures level content_duration published_timestamp subject
0 1070968 Ultimate Investment Banking Course https://www.udemy.com/ultimate-investment-bank... True 200 2147 23 51 All Levels 1.5 2017-01-18T20:58:58Z Business Finance
1 1113822 Complete GST Course & Certification - Grow You... https://www.udemy.com/goods-and-services-tax/ True 75 2792 923 274 All Levels 39.0 2017-03-09T16:34:20Z Business Finance
2 1006314 Financial Modeling for Business Analysts and C... https://www.udemy.com/financial-modeling-for-b... True 45 2174 74 51 Intermediate Level 2.5 2016-12-19T19:26:30Z Business Finance
3 1210588 Beginner to Pro - Financial Analysis in Excel ... https://www.udemy.com/complete-excel-finance-c... True 95 2451 11 36 All Levels 3.0 2017-05-30T20:07:24Z Business Finance
4 1011058 How To Maximize Your Profits Trading Options https://www.udemy.com/how-to-maximize-your-pro... True 200 1276 45 26 Intermediate Level 2.0 2016-12-13T14:57:18Z Business Finance
In [8]:
# Tag every course with the platform name; it is used as the root level of the chart
df_sunburst['Platform'] = "Udemy"

# Chart hierarchy, from the root outwards
sunburst_levels = ['Platform', 'subject', 'level']

# Total number of subscribers for each platform / subject / level combination
df_grouped = df_sunburst.groupby(sunburst_levels, as_index=False).agg(
    total_subscribers=('num_subscribers', 'sum')
)

# Build the sunburst chart, sizing each sector by its subscriber total
fig = px.sunburst(
    df_grouped,
    path=sunburst_levels,
    values='total_subscribers',
    title='Nombre de Subscriptors a Udemy Online per Subjecte i Nivell',
)

# Fix the figure size to a square of 800 x 800
fig.update_layout(width=800, height=800)

# Render the chart
fig.show()
In [9]:
# Relative location of the Norway meteorological CSV
path_ridgeline = "data/NorwayMeteoDataCompleted.csv"

# Read the CSV into a DataFrame
df_ridgeline = pd.read_csv(filepath_or_buffer=path_ridgeline)

# Initial inspection: summary statistics of the numeric columns
# (kept as the last expression so the notebook renders it)
df_ridgeline.describe()
Out[9]:
Unnamed: 0 latitude longtitude max(air_temperature P1D) max(relative_humidity P1D) max(wind_speed P1D) mean(air_temperature P1D) mean(relative_humidity P1D) mean(wind_speed P1D) sum(precipitation_amount P1D) day month year
count 237629.000000 237629.000000 237629.000000 44766.000000 36965.000000 35260.000000 63551.000000 37411.000000 35260.000000 107937.000000 237629.000000 237629.000000 237629.000000
mean 118814.000000 60.120401 13.299210 7.296911 89.019965 7.343434 4.160783 75.800406 4.169682 2.405301 15.728215 6.521414 2015.562053
std 68597.727896 6.994602 8.083162 9.082600 9.875235 4.809745 8.169349 13.286090 3.328436 7.006714 8.797737 3.448545 3.433181
min 0.000000 36.666111 -4.482222 -25.700000 31.000000 0.000000 -27.950000 20.000000 0.000000 -1.000000 1.000000 1.000000 2010.000000
25% 59407.000000 58.990000 8.094700 0.700000 85.000000 3.900000 -1.200000 67.000000 1.800000 0.000000 8.000000 4.000000 2013.000000
50% 118814.000000 60.863200 10.976200 7.000000 92.000000 6.100000 4.200000 77.000000 3.200000 0.100000 16.000000 7.000000 2016.000000
75% 178221.000000 63.554000 19.197778 14.037500 96.000000 9.700000 10.300000 86.000000 5.500000 2.100000 23.000000 10.000000 2019.000000
max 237628.000000 70.335700 35.106944 36.200000 107.000000 35.500000 42.600000 100.000000 29.900000 928.000000 31.000000 12.000000 2021.000000
In [10]:
# Preview the first five rows of the meteorological dataset
df_ridgeline.head(n=5)
Out[10]:
Unnamed: 0 sourceId latitude longtitude max(air_temperature P1D) max(relative_humidity P1D) max(wind_speed P1D) mean(air_temperature P1D) mean(relative_humidity P1D) mean(wind_speed P1D) sum(precipitation_amount P1D) day month year
0 0 SN100 61.134900 12.503900 NaN NaN NaN NaN NaN NaN 0.4 1 1 2010
1 1 SN1135 58.990000 11.540800 NaN NaN NaN NaN NaN NaN NaN 1 1 2010
2 2 SN1151800 50.100278 14.255556 NaN NaN NaN NaN NaN NaN NaN 1 1 2010
3 3 SN15262 61.674000 8.368500 NaN NaN NaN NaN NaN NaN NaN 1 1 2010
4 4 SN1531000 45.473056 28.032222 NaN NaN NaN NaN NaN NaN NaN 1 1 2010
In [11]:
# Catalan month names in calendar order; the number -> name mapping is built
# by numbering them starting at 1.
month_names_ca = [
    "Gener", "Febrer", "Març", "Abril", "Maig", "Juny",
    "Juliol", "Agost", "Setembre", "Octubre", "Novembre", "Desembre",
]
month_mapping = dict(enumerate(month_names_ca, start=1))
In [12]:
# Replace the numeric month codes with their Catalan names.
# The column is overwritten in place, so the mapping is applied only while the
# column is still numeric: Series.map yields NaN for values missing from the
# mapping, so re-running this cell on already-translated names would otherwise
# turn every month into NaN and break the cells below.
if pd.api.types.is_numeric_dtype(df_ridgeline['month']):
    df_ridgeline['month'] = df_ridgeline['month'].map(month_mapping)
In [13]:
# Column holding the daily mean air temperature
temperature_col = 'mean(air_temperature P1D)'

# Mean temperature of each month (Series indexed by month name); used later
# to drive the colour of each month in the plot
month_mean_serie = df_ridgeline.groupby(by='month')[temperature_col].mean()

# Attach each row's monthly mean as the 'mean_month' column
df_ridgeline['mean_month'] = df_ridgeline['month'].map(month_mean_serie)
In [14]:
# Column whose distribution is drawn in every facet
temperature_col = 'mean(air_temperature P1D)'

# Twelve colours from the "coolwarm" palette, one per month
pal = sns.color_palette("coolwarm", n_colors=12)

# Ridgeline plot: one thin facet row per month, coloured through the
# 'mean_month' hue column
g = sns.FacetGrid(
    data=df_ridgeline,
    row='month',
    hue='mean_month',
    aspect=15,
    height=0.75,
    palette=pal,
)

# Keyword arguments shared by both density layers
kde_shared = dict(bw_adjust=1, clip_on=False)

# Layer 1: filled density curve in the facet colour
g.map(sns.kdeplot, temperature_col, fill=True, alpha=1, linewidth=1.5, **kde_shared)
# Layer 2: white outline drawn over the filled curve
g.map(sns.kdeplot, temperature_col, color="w", lw=2, **kde_shared)
# Layer 3: horizontal baseline at y=0
g.map(plt.axhline, y=0, lw=2, clip_on=False)

# Write the month name at the left of each facet. The x position is the same
# for every facet (5 units left of the overall minimum), so it is computed once.
label_x = df_ridgeline[temperature_col].min() - 5
for facet_ax in g.axes.flat:
    # The facet title contains "=" followed by the month name; keep the part after it
    month_name = facet_ax.get_title().split('=')[1].strip()
    facet_ax.text(
        label_x, 0.02, month_name,
        fontweight='bold', fontsize=12,
        # Reuse the colour of the last line drawn on this facet
        color=facet_ax.lines[-1].get_color(),
    )

# Style clean-up: drop facet titles, y ticks, y labels and the left/bottom spines
g.set_titles("")
g.set(yticks=[])
g.set_ylabels()
g.despine(bottom=True, left=True)

# X-axis label (applied to the current axes)
plt.xlabel("Temperatura Mitjana de l'Aire (°C)", fontweight='bold', fontsize=12)

# Centered overall title for the whole figure
g.fig.suptitle(
    "Distribució de la Temperatura Mitjana de l'Aire per Mes a Noruega (2010-2021)",
    ha='center', fontsize=16, fontweight='bold',
)

plt.show()
No description has been provided for this image

Enllaços als datasets

Scatterplot: https://www.kaggle.com/datasets/valakhorasani/gym-members-exercise-dataset

Sunburst: https://www.kaggle.com/datasets/yusufdelikkaya/udemy-online-education-courses

Ridgeline: https://www.kaggle.com/datasets/annbengardt/noway-meteorological-data

In [ ]: